Installing and loading libraries

# Install (when missing) and attach every package this script relies on.
# requireNamespace() only probes for an installation; library() then attaches
# the package and stops with an error if it is still unavailable.
# Note: the original lubridate line installed and attached "readtext" instead,
# so a missing lubridate (provides dmy(), used for the feedback dates) was never
# installed. Both packages are handled explicitly here.
required_packages <- c(
  "quanteda",
  "lubridate",
  "readtext",
  "tidyverse",
  "pdftools",
  "haven",
  "parameters",
  "performance",
  "see"
)

for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

# Use the light ggplot2 theme for every plot produced below.
theme_set(theme_light())

Data Wrangling of Data set: 1/3 submission_firstround

Task A. Extract text from pdfs in the zip-folder of Consultation Round 1/3

## here() starts at C:/Users/batzdova/Desktop/EC-Web-Scrapping-and-Text-Mining
## [1] 435
#(II)
#unused<<<<<<<<<<<<<<<<<<<<<
#result <-lapply(file.list, FUN = function(files) {
#  pdf_text(files)
#})
#result <- lapply(file.list, pdftools::pdf_text)
#<<<<<<<<<<<<<<<<<<<<<<<<<<<<
#make a table with 2 columns: the doc name & pdf-content
# combine text from each pdf into one string with paste0

# Build a two-column table: the PDF path and the full text of that PDF.
# pdf_text() returns one string per page; the pages are collapsed into a single
# string per document. vapply() guarantees a character vector (one string per
# file) even when `file.list` is empty, which sapply() does not.
# `file.list` is created earlier in the script (not shown in this section).
filestextDF <- data.frame(
  Document = file.list,
  text = vapply(
    file.list,
    function(pdf_path) paste0(pdf_text(pdf_path), collapse = " "),
    character(1)
  )
)

tb_pdf <- as_tibble(filestextDF)
# Extract the document ids (first 7 characters of the file name) so the PDFs
# can be matched with the survey metadata later.
# N = 435 docs
# Problem: multiple docs for the same id (= multiple docs by the same submitter)

# Keep only the file name. basename() strips whatever directory the files live
# in, so this no longer depends on one user's absolute path (the original
# removed a hard-coded "C:/Users/..." prefix and silently produced wrong ids
# on any other machine).
tb_pdf$Document <- basename(as.character(tb_pdf$Document))
ids <- substr(tb_pdf$Document, 1, 7)

tb_pdf$id <- ids
library(readr)

# Read the semicolon-separated survey export of the first consultation round.
Public_consultation_2020 <- read_delim(
  "./Data/Public_consultation_2020/files/Public_consultation_2020.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
)

consult_meta <- as_tibble(Public_consultation_2020)

# Attach the PDF text to the survey rows: `Reference` in the metadata is
# matched against the 7-character `id` taken from the file names.
temp <- as_tibble(
  left_join(consult_meta, tb_pdf, by = c("Reference" = "id"))
)

# Print the observations that have no PDF text.
dplyr::filter(temp, is.na(text))

Recoding the survey (First submission round)

# Drop the two columns that are not needed for the analysis: the upload prompt
# and the publication privacy setting (both removed in a single selection).
temp <- dplyr::select(
  temp,
  !c(`You can upload a document here:\n\n`, `Publication privacy settings`)
)

# Rename the survey columns to short analysis names.
# Columns with short, fixed headers are renamed directly. The long question
# columns are located with matches() (a regular expression on the header text)
# and given one fixed name each, so each pattern is expected to hit exactly
# one column.
# Note: the original pipeline applied the 'AI may endanger safety' rename a
# second time. By then the column is already called `concern_safety`, so the
# selection is empty; recent dplyr versions reject a length-1 name for an empty
# selection, older ones silently did nothing. The repeat is removed.
temp <- temp %>%
  rename(
    filename = Document,
    country = Country,
    org = `Organisation name`,
    id = Reference,
    time = `Feedback date`,
    lang = Language,
    type = `User type`,
    firstname = `First name`,
    surname = Surname,
    scope = Scope,
    register = `Transparency register number`,
    size = `Organisation size`
  ) %>%
  rename_with(~ "coop_member_states", matches("Working with Member states")) %>%
  rename_with(~ "research_innov", matches("Focussing the efforts of the research and innovation community")) %>%
  rename_with(~ "skills", matches("\n: Skills")) %>%
  rename_with(~ "SME", matches("\n: Focus on SMEs")) %>%
  rename_with(~ "private_sector", matches("\n: Partnership with the private sector")) %>%
  rename_with(~ "public_sector", matches("\n: Promoting the adoption of AI by the public sector")) %>%
  rename_with(~ "other_action", matches("other actions that should be considered?")) %>%
  rename_with(~ "excel_research", matches("\n: Strengthen excellence in research")) %>%
  rename_with(~ "testing_fac", matches("Establish world-reference testing facilities for AI")) %>%
  rename_with(~ "uptake_ai", matches("Promote the uptake of AI by business and the public sector")) %>%
  rename_with(~ "startup_finance", matches("Increase the financing for start-ups innovating in AI")) %>%
  rename_with(~ "training_skills", matches("Develop skills for AI and adapt existing training programmes")) %>%
  rename_with(~ "eu_data_space", matches("Build up the European data space")) %>%
  rename_with(~ "other_area", matches("Are there other areas that that should be considered")) %>%
  rename_with(~ "lighthouse", matches("Support the establishment of a lighthouse research centre that is world class and able to attract the best minds")) %>%
  rename_with(~ "net_centres", matches("Network of existing AI research excellence centres")) %>%
  rename_with(~ "partner_research", matches("Set up a public-private partnership for industrial research")) %>%
  rename_with(~ "action_research", matches("actions to strengthen the research and innovation community that should be given a priority")) %>%
  rename_with(~ "benefits_ai", matches("Help to raise SME’s awareness about potential benefits of AI")) %>%
  rename_with(~ "access_testing", matches("Provide access to testing and reference facilities")) %>%
  rename_with(~ "knowhow_transfer", matches("Promote knowledge transfer and support the development of AI expertise for SMEs")) %>%
  rename_with(~ "partner_aiproject", matches("Support partnerships between SMEs, larger enterprises and academia around AI projects")) %>%
  rename_with(~ "equity_finance", matches("Provide information about equity financing for AI startups")) %>%
  rename_with(~ "tasks_innovhub", matches("important for specialised Digital Innovations Hubs")) %>%
  rename_with(~ "concern_safety", matches("AI may endanger safety")) %>%
  rename_with(~ "concern_rights", matches("AI may breach fundamental rights")) %>%
  rename_with(~ "concern_discrim", matches("The use of AI may lead to discriminatory outcomes")) %>%
  rename_with(~ "concern_explain", matches("AI may take actions for which the rationale cannot be explained")) %>%
  rename_with(~ "concern_compensat", matches("AI may make it more difficult for persons having suffered harm to obtain compensation")) %>%
  rename_with(~ "concern_accuracy", matches("AI is not always accurate")) %>%
  rename_with(~ "concern_other", matches("Do you have any other concerns about AI that are not mentioned")) %>%
  rename_with(~ "leg_rules", matches("Do you think that the concerns expressed above can be addressed by applicable EU legislation"))
# Name the remaining question columns by position (columns 44 to 71).
# This depends on the column order produced by the steps above; any change to
# the dropped or joined columns shifts these positions.
# NOTE(review): the original assigned position 65 twice ("reform_assess", then
# "risk_procedure"), so "reform_assess" never survived. The final names are
# kept exactly as the original left them; check against the questionnaire
# whether the names from position 65 onwards are off by one.
names(temp)[44:71] <- c(
  "rules_other",               # 44
  "rules_highrisk",            # 45
  "mitigate_other",            # 46
  "highrisk_approach",         # 47
  "highrisk_other",            # 48
  "highrisk_app",              # 49
  "requir_qual_training_data", # 50
  "requir_record_data",        # 51
  "requir_purpose",            # 52
  "requir_robust_acc",         # 53
  "requir_human_oversight",    # 54
  "requir_liability",          # 55
  "requir_biometric",          # 56
  "requir_spec",               # 57
  "label_aisystem",            # 58
  "label_suggest",             # 59
  "trust_spec",                # 60
  "trust_enforce",             # 61
  "compliance_spec",           # 62
  "risk_spec",                 # 63
  "risk_reform",               # 64
  "risk_procedure",            # 65 (previously also assigned "reform_assess")
  "risk_other",                # 66
  "liability_reform",          # 67
  "liabilty_further",          # 68
  "liability_national",        # 69
  "liabilty_app",              # 70
  "liabilty_other"             # 71
)
# ---- Recode the Likert-type survey items to numeric scores ----
# All items listed below share the same answer labels
# ("1 - Not important at all" ... "5 - Very important", plus "No opinion").
# They were previously recoded by 32 identical copy-pasted case_when() blocks;
# one helper applied with across() does the same job.

#' Map the survey's answer labels to numeric scores.
#'
#' @param x A vector of answer labels.
#' @return A double vector the same length as `x`: 1-5 for the importance
#'   labels, 0 for "No opinion", and NA for missing or any other value
#'   (the same result the original case_when() blocks produced).
recode_likert <- function(x) {
  likert_scores <- c(
    "5 - Very important" = 5,
    "4 - Important" = 4,
    "3 - Neutral" = 3,
    "2 - Not important" = 2,
    "1 - Not important at all" = 1,
    "No opinion" = 0
  )
  # match() yields NA for unknown or missing labels, which indexes to NA.
  unname(likert_scores[match(x, names(likert_scores))])
}

# NOTE(review): "No opinion" is scored 0 and therefore enters every mean and
# standard deviation (the recorded summaries show Min. 0.00). Consider NA
# instead if it should not count as the lowest rating.
likert_items <- c(
  "coop_member_states",        # cooperation with member states
  "research_innov",            # research and innovation focus
  "skills",                    # skills
  "SME",                       # focus on SMEs
  "private_sector",            # partnership with the private sector
  "public_sector",             # adoption of AI by the public sector
  "excel_research",            # strengthen excellence in research
  "testing_fac",               # world-reference testing facilities for AI
  "uptake_ai",                 # uptake of AI by business and public sector
  "startup_finance",           # financing for start-ups innovating in AI
  "training_skills",           # skills for AI and training programmes
  "eu_data_space",             # build up the European data space
  "lighthouse",                # lighthouse research centre
  "net_centres",               # network of AI research excellence centres
  "partner_research",          # public-private partnership for research
  "benefits_ai",               # SME awareness of potential benefits of AI
  "access_testing",            # access to testing and reference facilities
  "knowhow_transfer",          # knowledge transfer / AI expertise for SMEs
  "partner_aiproject",         # partnerships around AI projects
  "equity_finance",            # information about equity financing
  "concern_safety",            # AI may endanger safety
  "concern_rights",            # AI may breach fundamental rights
  "concern_discrim",           # AI may lead to discriminatory outcomes
  "concern_explain",           # rationale of actions cannot be explained
  "concern_compensat",         # difficult to obtain compensation
  "concern_accuracy",          # AI is not always accurate
  "requir_qual_training_data", # quality of training data sets
  "requir_record_data",        # keeping of records and data
  "requir_purpose",            # info on purpose and nature of AI systems
  "requir_robust_acc",         # robustness and accuracy of AI systems
  "requir_liability",          # clear liability and safety rules
  "requir_human_oversight"     # human oversight
)

temp <- temp %>%
  mutate(across(all_of(likert_items), recode_likert))

#' Min, max, mean and standard deviation of one recoded item, rounded to 2 dp.
#'
#' Missing answers are ignored. The original passed `na.rm = FALSE` to sd(),
#' which made the reported sd NA for every item.
#'
#' @param x A numeric vector of recoded scores.
#' @return A named numeric vector with elements Min., Max., Mean and sd.
describe_item <- function(x) {
  round(
    c(summary(x)[c("Min.", "Max.", "Mean")], "sd" = sd(x, na.rm = TRUE)),
    digits = 2
  )
}

# Cooperation with member states.
# Recorded in the earlier run: Min. 0.00, Max. 5.00, Mean 4.28 (sd was NA).
describe_item(temp$coop_member_states)
hist(temp$coop_member_states, breaks = 60)

# Financing for start-ups innovating in AI.
# Recorded in the earlier run: Min. 0.00, Max. 5.00, Mean 3.72 (sd was NA).
describe_item(temp$startup_finance)
hist(temp$startup_finance, breaks = 60)

# Human oversight.
# Recorded in the earlier run: Min. 0.00, Max. 5.00, Mean 4.39 (sd was NA).
describe_item(temp$requir_human_oversight)
hist(temp$requir_human_oversight, breaks = 60)


Merged data frame of the first consultation round

# Build the tidy first-round data set in one pipeline:
#  * merge the name columns (firstname through surname) into `person`,
#  * tag every row as consultation round "one",
#  * harmonise the user-type and organisation-size labels with the spelling
#    used for the later rounds (old value = new value).
tidy_df1 <- temp %>%
  unite("person", firstname:surname, sep = " ") %>%
  mutate(
    consult_round = "one",
    type = recode(
      type,
      `NGO (Non-governmental organisation)` = "Non-governmental organisation (NGO)",
      `Academic/Research Institution` = "Academic/research Institution",
      `EU Citizen` = "EU citizen",
      `Company/Business organisation` = "Company/business organisation",
      `Consumer Organisation` = "Consumer organisation",
      `Trade Union` = "Trade union",
      `Business Association` = "Business association"
    ),
    size = recode(
      size,
      `Medium (< 250 employees)` = "Medium (50 to 249 employees)",
      `Small (< 50 employees)` = "Small (10 to 49 employees)",
      `Micro (< 10 employees)` = "Micro (1 to 9 employees)"
    )
  )

Second consultation round (roadmap_2020) and final round (commission_adoption_2021)

# Load the two scraped feedback tables (one CSV per later consultation round).
# The `##` lines below are the console output recorded when the script was run.
library(readr)
# Final round: feedback on the Commission adoption (2021).
commission_adoption_2021 <- read_csv("./Augmented_data/commission_adoption_2021.csv")
## Rows: 304 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (11): Feedback reference, Submitted on, Submitted by, User type, Organis...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Second round: feedback on the roadmap (2020).
roadmap_2020 <- read_csv("./Augmented_data/roadmap_2020.csv")
## Rows: 123 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (11): Feedback reference, Submitted on, Submitted by, User type, Organis...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Alternative for tidying the column names: library(janitor); roadmap_2020 %>% clean_names()


# Second consultation round (roadmap 2020): rename the scraped columns to the
# shared analysis names, parse the submission date (day-month-year) and
# translate the German labels for user type, organisation size and country
# into the English labels used for the first round (old value = new value).
scrap20 <- roadmap_2020 %>%
  rename(
    country = `Country of origin`,
    id = `Feedback reference`,
    time = `Submitted on`,
    person = `Submitted by`,
    type = `User type`,
    org = Organisation,
    size = `Organisation size`,
    register = `Transparency register number`,
    initiative = Initiative,
    abstract = Paragraph,
    text = pdf
  ) %>%
  mutate(time = dmy(time)) %>%
  mutate(type = recode(
    type,
    `NRO (Nichtregierungsorganisation)` = "Non-governmental organisation (NGO)",
    `Universität/Forschungseinrichtung` = "Academic/research Institution",
    `EU-Bürger/-in` = "EU citizen",
    `Sonstiges` = "Other",
    `Unternehmen/Unternehmensverband` = "Company/business organisation",
    `Verbraucherverband` = "Consumer organisation",
    `Behörde` = "Public authority",
    `Gewerkschaft` = "Trade union",
    `Wirtschaftsverband` = "Business association",
    `-` = "Missing"
  )) %>%
  mutate(size = recode(
    size,
    `mittel (50 bis 249 Beschäftigte)` = "Medium (50 to 249 employees)",
    `klein (10 bis 49 Beschäftigte)` = "Small (10 to 49 employees)",
    `groß (250 oder mehr Beschäftigte)` = "Large (250 or more)",
    `-` = "Missing",
    `sehr klein (1 bis 9 Beschäftigte)` = "Micro (1 to 9 employees)"
  )) %>%
  # Fallback for labels the exact recode above did not catch (the author noted
  # that it "did not function"). The checks use ASCII-only fragments and run in
  # this order on purpose: "sehr klein" also contains "klein", so testing
  # "klein" first (as the original did) relabelled micro organisations as Small
  # and the "sehr" check could never match. Values already translated to
  # English match none of the patterns and pass through unchanged.
  mutate(size = case_when(
    str_detect(size, "sehr") ~ "Micro (1 to 9 employees)",
    str_detect(size, "klein") ~ "Small (10 to 49 employees)",
    str_detect(size, "mittel") ~ "Medium (50 to 249 employees)",
    str_detect(size, "^gro") ~ "Large (250 or more)",
    TRUE ~ size
  )) %>%
  mutate(country = recode(
    country,
    `Vereinigten Staaten` = "United States",
    `Belgien` = "Belgium",
    `Slowakei` = "Slovakia",
    `Italien` = "Italy",
    `Niederlande` = "Netherlands",
    `Dänemark` = "Denmark",
    `Vereinigtes Königreich` = "United Kingdom",
    `Frankreich` = "France",
    `-` = "Missing",
    `international` = "Other",
    `Spanien` = "Spain",
    `Österreich` = "Austria",
    `Schweden` = "Sweden",
    `Polen` = "Poland",
    `Irland` = "Ireland",
    `Finnland` = "Finland",
    `Deutschland` = "Germany",
    `Ungarn` = "Hungary",
    `Tschechien` = "Czech Republic",
    `Rumänien` = "Romania",
    `Bulgarien` = "Bulgaria"
  ))
  


# Final consultation round (Commission adoption 2021): same column names as
# the other rounds, a parsed submission date, and clean-up of values that do
# not belong in their column.
# NOTE(review): the `type` column contains two person names and the `country`
# column contains governance levels and "feedback.usertype.*" keys; this looks
# like misaligned scraped rows. Confirm against the raw CSV.
scrap21 <- commission_adoption_2021 %>%
  rename(
    country = `Country of origin`,
    id = `Feedback reference`,
    time = `Submitted on`,
    person = `Submitted by`,
    type = `User type`,
    org = Organisation,
    size = `Organisation size`,
    register = `Transparency register number`,
    initiative = Initiative,
    abstract = Paragraph,
    text = pdf
  ) %>%
  mutate(
    time = dmy(time),
    type = recode(
      type,
      `Ukyo Mori` = "Other",
      `Johannes Kröhnert` = "Other",
      `-` = "Missing"
    ),
    country = recode(
      country,
      `Regional` = "Other",
      `Local` = "Other",
      `feedback.usertype.company` = "Other",
      `feedback.usertype.business_association` = "Other",
      `National` = "Other"
    ),
    size = recode(size, `-` = "Missing")
  )
  

# Tag each scraped data set with its consultation round.
scrap20 <- mutate(scrap20, consult_round = "two")
scrap21 <- mutate(scrap21, consult_round = "three")

# Problem 1 in scrap20: ids F550611 and F550610 are duplicates with an empty
# abstract and text section; the complete entry is F550619.
# scrap20 has 123 rows but should have 133; 121 rows remain after this filter.
scrap20 <- dplyr::filter(scrap20, id != "F550611", id != "F550610")

# Problem 2: rows that are missing on all variables. Inspect them first
# (printed), then drop the rows whose id is the placeholder "-".
dplyr::filter(scrap20, is.na(abstract))
dplyr::filter(scrap20, id == "-")
scrap20 <- dplyr::filter(scrap20, id != "-")

# There are n = 85 PDFs in the folder but only n = 69 [text] entries in the CSV.
# n = 49 entries have an abstract but no text:
#   filter(!is.na(abstract), is.na(text))
# Print the rows that do have PDF text.
dplyr::filter(scrap20, !is.na(text))

A merged data frame of two submission rounds

# Stack round two (scrap20) on top of round three (scrap21). rbind() requires
# both data frames to have the same columns.
submission <- rbind(scrap20,scrap21)

Merged data frame of all three submission rounds

with different cell and column numbers

they share: id, time, person, type, org, size, register, country, text

# Parse the first-round feedback date (day.month.year) and bring the columns
# shared by all three rounds into one order directly after `time`:
# person, type, org, size, register, country, text.
tidy_df1 <- tidy_df1 %>%
  mutate(time = as.Date(time, "%d.%m.%Y")) %>%
  relocate(person, type, org, size, register, country, text, .after = time)

# In the scraped rounds only `text` has to move (placed after `country`).
submission <- relocate(submission, text, .after = country)

# Merge all three rounds. The join uses every shared column as a key, so rows
# are combined only when all of these values agree; all other rows are kept
# from both sides.
three_submission <- full_join(
  tidy_df1,
  submission,
  by = c(
    "id", "time", "type", "size", "org", "register", "text",
    "consult_round", "person", "country"
  )
)

# Code the organisation size as a number: 4 = large ... 1 = micro,
# 0 = "Missing". Any other label, and NA, becomes NA.
three_submission <- mutate(
  three_submission,
  size = recode(
    size,
    "Large (250 or more)" = 4,
    "Medium (50 to 249 employees)" = 3,
    "Small (10 to 49 employees)" = 2,
    "Micro (1 to 9 employees)" = 1,
    "Missing" = 0,
    .default = NA_real_
  )
)
library(pdftools) # to read in pdfs
library(tidytext) # to tokenize text, remove stop words, and calculate tfidf
## Warning: Paket 'tidytext' wurde unter R Version 4.1.3 erstellt
library(tidyverse) # to wrangle data, count words, and plot data
library(textclean) # to clean up text a bit, removing non-ascii chars etc.

# Clean the submission texts step by step (each step works on the result of
# the previous one), then store the cleaned table as RDS and CSV in the
# working directory.
consult_text_clean <- three_submission %>%
  mutate(
    text = str_trim(text),                       # trim leading/trailing white space
    text = replace_url(text),                    # remove URLs
    text = replace_non_ascii(text),              # replace non-ASCII characters
    text = replace_symbol(text),                 # replace $ and other symbols with words
    text = str_remove_all(text, "[0-9]+"),       # remove numbers
    text = str_remove_all(text, "[[:punct:]]+"), # remove punctuation
    # Remove every "Ref Ares" fragment, not just the first one: str_replace()
    # (used originally) only touched the first occurrence per document.
    # NOTE(review): presumably a reference stamp repeated on each PDF page -
    # confirm against the source documents.
    text = str_remove_all(text, "Ref Ares"),
    text = str_squish(text)                      # collapse extra white space (e.g. line breaks)
  )
saveRDS(consult_text_clean, "consult_text_clean.rds")
write_csv(consult_text_clean, "consult_text_clean.csv")
# Optional: custom stop words that are frequent but should not be kept.
# custom_stop_words <- tibble(word = c("canada", "canadas", "report", "cent", "gouvqcca", "crimi", "nal"))

# One row per lower-cased word of every submission text.
consult_tokens <- unnest_tokens(
  consult_text_clean, word, text,
  token = "words", to_lower = TRUE
)

# Drop English stop words (e.g. I, a, the).
consult_tokens <- anti_join(consult_tokens, stop_words)
# Optional: consult_tokens <- anti_join(consult_tokens, custom_stop_words)
# Optional: filter(n > 50) to keep only entries with more than 50 words.

# Count how often each word occurs per submission id; these counts are the
# input for tf-idf.
consult_tokens <- count(consult_tokens, id, word)
## Joining, by = "word"

Deal with missing variables

library(naniar)
## Warning: Paket 'naniar' wurde unter R Version 4.1.3 erstellt
# Number of rows in the merged data that have no submission text.
sum(is.na(consult_text_clean$text))
## [1] 895
# Plot the number of missing values per variable (naniar).
gg_miss_var(three_submission)
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## i Please use "none" instead.
## i The deprecated feature was likely used in the naniar package.
##   Please report the issue at <https://github.com/njtierney/naniar/issues>.

library(janitor)
## Warning: Paket 'janitor' wurde unter R Version 4.1.3 erstellt
## 
## Attache Paket: 'janitor'
## Die folgenden Objekte sind maskiert von 'package:stats':
## 
##     chisq.test, fisher.test
# Frequency table of submissions per country, with a totals row and
# percentages formatted as text, rendered as a table via knitr.
knitr::kable(
  adorn_pct_formatting(
    adorn_totals(
      tabyl(consult_text_clean, country),
      "row"
    )
  )
)
country n percent valid_percent
Albania 1 0.1% 0.1%
Austria 28 1.7% 2.3%
Belgium 265 16.0% 22.2%
Brazil 1 0.1% 0.1%
Bulgaria 7 0.4% 0.6%
Canada 4 0.2% 0.3%
China 2 0.1% 0.2%
Costa Rica 1 0.1% 0.1%
Côte d’Ivoire 1 0.1% 0.1%
Croatia 2 0.1% 0.2%
Cyprus 1 0.1% 0.1%
Czech Republic 8 0.5% 0.7%
Denmark 22 1.3% 1.8%
Finland 24 1.5% 2.0%
France 108 6.5% 9.1%
Germany 187 11.3% 15.7%
Greece 7 0.4% 0.6%
Hungary 4 0.2% 0.3%
India 5 0.3% 0.4%
Iraq 1 0.1% 0.1%
Ireland 16 1.0% 1.3%
Italy 46 2.8% 3.9%
Japan 8 0.5% 0.7%
Latvia 1 0.1% 0.1%
Lithuania 2 0.1% 0.2%
Luxembourg 3 0.2% 0.3%
Malta 3 0.2% 0.3%
Missing 11 0.7% 0.9%
Netherlands 64 3.9% 5.4%
Norway 5 0.3% 0.4%
Other 11 0.7% 0.9%
Poland 23 1.4% 1.9%
Portugal 18 1.1% 1.5%
Romania 14 0.8% 1.2%
Slovakia 5 0.3% 0.4%
South Korea 1 0.1% 0.1%
Spain 70 4.2% 5.9%
Sweden 29 1.8% 2.4%
Switzerland 16 1.0% 1.3%
Turkey 1 0.1% 0.1%
United Kingdom 78 4.7% 6.5%
United States 88 5.3% 7.4%
NA 462 27.9% -
Total 1654 100.0% 100.0%